/* --------------------------------------------------------------  */
/* (C)Copyright 2001,2006,                                         */
/* International Business Machines Corporation,                    */
/* Sony Computer Entertainment, Incorporated,                      */
/* Toshiba Corporation,                                            */
/*                                                                 */
/* All Rights Reserved.                                            */
/* --------------------------------------------------------------  */
/* PROLOG END TAG zYx                                              */
#include "../common.h"
#include <spu_mfcio.h>
#include <spu_intrinsics.h>
#include <stdio.h>
#include <math.h>
#include <stdbool.h>
#include <string.h>

//unsigned char parameter_area[128] __attribute__ ((aligned (128)));
arguments_union_t args __attribute__ ((aligned (128)));
uint32_t tags[8];

void waitfor_matrix_io ( int tag ) {
	mfc_write_tag_mask(1<<tag);
	mfc_read_tag_status_all();   // Wait for the data array DMA to complete.
}

complex_float_t jacobi(const float a, const float b, const float g)
{
	complex_float_t x;

	float w = (b - a) / (2.0 * g);

	float t = sign(w) / (fabs(w) + sqrtf(1.0 + w*w));
	x.real = 1.0 / sqrtf(1.0 + t*t);
	x.imag = t * x.real;

	debug2("t = %f / (%f + sqrt(1.0 + %f*%f)) = %f\n", sign(w), fabs(w), w, w, t);

	return x;
}

bool hestenes_jacobi_calculation(unsigned long long matrix, const size_t lb_i, const size_t ub_i, 
	const size_t lb_j, const size_t ub_j,
	complex_float_matrix_t P_matrix, const float delta,
	const size_t Nexpanded, const size_t R)
{
	debug1("hestenes_jacobi_calculation [%d, %d] x [%d, %d]\n", lb_i, ub_i, lb_j, ub_j);

	bool converged = true;
	size_t i, j, kouter, kinner;
	float a, b, g;
	unsigned long long basei, basej;
	vector float localRowi1[8] __attribute__((aligned(128)));
	vector float localRowj1[8] __attribute__((aligned(128)));
	vector float localRowi2[8] __attribute__((aligned(128)));
	vector float localRowj2[8] __attribute__((aligned(128)));
	vector float tmpTheta_i, tmpTheta_j, tmpG, data_i, data_j;
	float G_matrix[R][R];

	// Vectors of norms for row block i and row block j
	float theta_i[R];
	float theta_j[R];

	// TBD: Use memset on each matrix/vector to initialize
	memset(G_matrix, 0, R * R * sizeof(float));
	memset(P_matrix, 0, R * R * sizeof(complex_float_t));
	memset(theta_i, 0, R * sizeof(float));
	memset(theta_j, 0, R * sizeof(float));

	debug2("Calculation: Finished initializing stuff\n");

	for (i = lb_i; i <= ub_i; i++)
	{
		// Calculate theta_i
		tmpTheta_i = (vector float) {0.0, 0.0, 0.0, 0.0};
		basei = matrix + i * Nexpanded * sizeof(float);
		mfc_get((void *) localRowi1, basei, 128, tags[0], 0, 0);
		for (kouter = 0; kouter < Nexpanded; kouter+= 32)
		{
			// Double-buffer dma calls
			if (kouter + 32 < Nexpanded)
			{
				mfc_get((void *) localRowi2, basei + (kouter + 32) * sizeof(float), 128, tags[1], 0, 0);
			}
			waitfor_matrix_io(tags[0]);
			for (kinner = 0; kinner < 8; kinner++)
			{
				tmpTheta_i = spu_madd(localRowi1[kinner], localRowi1[kinner], tmpTheta_i);
			}
			kouter += 32;
			if (kouter + 32 < Nexpanded)
			{
				mfc_get((void *) localRowi1, basei + (kouter + 32) * sizeof(float), 128, tags[0], 0, 0);
			}
			if (kouter < Nexpanded)
			{
				waitfor_matrix_io(tags[1]);
				for (kinner = 0; kinner < 8; kinner++)
				{
					tmpTheta_i = spu_madd(localRowi2[kinner], localRowi2[kinner], tmpTheta_i);
				}
			}
		}
		theta_i[i % R] += spu_extract(tmpTheta_i, 0) + spu_extract(tmpTheta_i, 1) + spu_extract(tmpTheta_i, 2) + spu_extract(tmpTheta_i, 3);

		for (j = lb_j; j <= ub_j; j++)
		{
			if (i < j)
			{
				tmpG = (vector float) {0.0, 0.0, 0.0, 0.0};
				basei = matrix + i * Nexpanded * sizeof(float);
				basej = matrix + j * Nexpanded * sizeof(float);
				mfc_get((void *) localRowi1, basei, 128, tags[0], 0, 0);
				mfc_get((void *) localRowj1, basej, 128, tags[1], 0, 0);
				for (kouter = 0; kouter < Nexpanded; kouter+= 32)
				{
					// Double-buffer
					if (kouter + 32 < Nexpanded)
					{
						mfc_get((void *) localRowi2, basei + (kouter + 32) * sizeof(float), 128, tags[2], 0, 0);
						mfc_get((void *) localRowj2, basej + (kouter + 32) * sizeof(float), 128, tags[3], 0, 0);
					}
					waitfor_matrix_io(tags[0]);
					waitfor_matrix_io(tags[1]);
					for (kinner = 0; kinner < 8; kinner++)
					{
						data_i = localRowi1[kinner];
						data_j = localRowj1[kinner];
						tmpG = spu_madd(data_i, data_j, tmpG);
					}
					kouter += 32;
					if (kouter + 32 < Nexpanded)
					{
						mfc_get((void *) localRowi1, basei + (kouter + 32) * sizeof(float), 128, tags[0], 0, 0);
						mfc_get((void *) localRowj1, basej + (kouter + 32) * sizeof(float), 128, tags[1], 0, 0);
					}
					if (kouter < Nexpanded)
					{
						waitfor_matrix_io(tags[2]);
						waitfor_matrix_io(tags[3]);
						for (kinner = 0; kinner < 8; kinner++)
                    	{
                    	    data_i = localRowi2[kinner];
                    	    data_j = localRowj2[kinner];
							tmpG = spu_madd(data_i, data_j, tmpG);
                    	}
					}
				}
				G_matrix[i % R][j % R] += spu_extract(tmpG, 0) + spu_extract(tmpG, 1) + spu_extract(tmpG, 2) + spu_extract(tmpG, 3);
			}
		}
	}

	for (j = lb_j; j <= ub_j; j++)
	{
		tmpTheta_j = (vector float) {0.0, 0.0, 0.0, 0.0};
		basej = matrix + j * Nexpanded * sizeof(float);
		mfc_get((void *) localRowj1, basej, 128, tags[0], 0, 0);
		for (kouter = 0; kouter < Nexpanded; kouter+= 32)
		{
			// Double-buffer dma calls
			if (kouter + 32 < Nexpanded)
			{
				mfc_get((void *) localRowj2, basej + (kouter + 32) * sizeof(float), 128, tags[1], 0, 0);
			}
			waitfor_matrix_io(tags[0]);
			for (kinner = 0; kinner < 8; kinner++)
			{
				tmpTheta_j = spu_madd(localRowj1[kinner], localRowj1[kinner], tmpTheta_j);
			}
			kouter += 32;
			if (kouter + 32 < Nexpanded)
			{
				mfc_get((void *) localRowj1, basej + (kouter + 32) * sizeof(float), 128, tags[0], 0, 0);
			}
			if (kouter < Nexpanded)
			{
				waitfor_matrix_io(tags[1]);
				for (kinner = 0; kinner < 8; kinner++)
				{
					tmpTheta_j = spu_madd(localRowj2[kinner], localRowj2[kinner], tmpTheta_j);
				}
			}
		}
		theta_j[j % R] += spu_extract(tmpTheta_j, 0) + spu_extract(tmpTheta_j, 1) + spu_extract(tmpTheta_j, 2) + spu_extract(tmpTheta_j, 3);
	}
	
	//printf("Calculation: Finished calculating theta vectors and P matrix\n");

	// Now the norms and dotproducts should be calculated, so we can calculate the jacobi values
	for (i = lb_i; i <= ub_i; i++)
	{
		for (j = lb_j; j <= ub_j; j++)
		{
			if (i < j)
			{
				a = theta_i[i % R];
				b = theta_j[j % R];
				g = G_matrix[i % R][j % R];
	
				debug2("a = norm(row_block[i, %d]) = %f\n", i, a);
				debug2("b = norm(row_block[j, %d]) = %f\n", j, b);
				debug2("g = dotprod(row_block[i, %d], row_block[j, %d]) = %f\n", i, j, g);
	
				if (fabs(g) > delta)
				{
					converged = false;
				}
	
				if (fabs(g) > EPSILON)
				{
					MATRIX_ELEMENT(P_matrix, i % R, j % R, R) = jacobi(a, b, g);
				}
				else
				{
					MATRIX_ELEMENT(P_matrix, i % R, j % R, R).real = 1.0;
					MATRIX_ELEMENT(P_matrix, i % R, j % R, R).imag = 0.0;
				}
	
				debug2("P for row pair(%d, %d) = (%f, %f)\n", i, j, 
					MATRIX_ELEMENT(P_matrix, i % R, j % R, R).real, MATRIX_ELEMENT(P_matrix, i % R, j % R, R).imag);
			}
		}
	}

	return converged;
}

void hestenes_jacobi_application(unsigned long long matrix, const size_t lb_i, const size_t ub_i, 
	const size_t lb_j, const size_t ub_j, complex_float_matrix_t P_matrix,
	const size_t Nexpanded, const size_t R)
{
	debug1("hestenes_jacobi_application [%d, %d] x [%d, %d]\n", lb_i, ub_i, lb_j, ub_j);

	size_t i, j, kouter, kinner;
	vector float data_i, data_j;
	complex_float_t x;
	vector float localRowi1[32] __attribute__((aligned(128)));
	vector float localRowj1[32] __attribute__((aligned(128)));
	vector float localRowi2[32] __attribute__((aligned(128)));
	vector float localRowj2[32] __attribute__((aligned(128)));
	unsigned long long basei, basej;

	for (i = lb_i; i <= ub_i; i++)
	{
		for (j = lb_j; j <= ub_j; j++)
		{
			if (i < j)
			{
				x = MATRIX_ELEMENT(P_matrix, i % R, j % R, R);
				basei = matrix + i * Nexpanded * sizeof(float);
				basej = matrix + j * Nexpanded * sizeof(float);
				mfc_get((void *) localRowi1, basei, 128, tags[0], 0, 0);
				mfc_get((void *) localRowj1, basej, 128, tags[1], 0, 0);
				for (kouter = 0; kouter < Nexpanded; kouter+= 32)
				{
					// Double-buffer
					if (kouter + 32 < Nexpanded)
					{
						mfc_get((void *) localRowi2, basei + (kouter + 32) * sizeof(float), 128, tags[2], 0, 0);
						mfc_get((void *) localRowj2, basej + (kouter + 32) * sizeof(float), 128, tags[3], 0, 0);
					}
					waitfor_matrix_io(tags[0]);
					waitfor_matrix_io(tags[1]);
					for (kinner = 0; kinner < 8; kinner++)
					{
						data_i = localRowi1[kinner];
						data_j = localRowj1[kinner];

						localRowi1[kinner] = spu_sub(spu_mul(spu_splats(x.real), data_i), spu_mul(spu_splats(x.imag), data_j));
						localRowj1[kinner] = spu_add(spu_mul(spu_splats(x.imag), data_i), spu_mul(spu_splats(x.real), data_j));
					}
					waitfor_matrix_io(tags[4]);
					mfc_put((void *) localRowi1, basei + kouter * sizeof(float), 128, tags[4], 0, 0);
					waitfor_matrix_io(tags[5]);
					mfc_put((void *) localRowj1, basej + kouter * sizeof(float), 128, tags[5], 0, 0);
					kouter += 32;
					if (kouter + 32 < Nexpanded)
					{
						mfc_get((void *) localRowi1, basei + (kouter + 32) * sizeof(float), 128, tags[0], 0, 0);
						mfc_get((void *) localRowj1, basej + (kouter + 32) * sizeof(float), 128, tags[1], 0, 0);
					}
					if (kouter < Nexpanded)
					{
						waitfor_matrix_io(tags[2]);
						waitfor_matrix_io(tags[3]);
						for (kinner = 0; kinner < 8; kinner++)
						{
							data_i = localRowi2[kinner];
							data_j = localRowj2[kinner];

							localRowi2[kinner] = spu_sub(spu_mul(spu_splats(x.real), data_i), spu_mul(spu_splats(x.imag), data_j));
							localRowj2[kinner] = spu_add(spu_mul(spu_splats(x.imag), data_i), spu_mul(spu_splats(x.real), data_j));
						}
						waitfor_matrix_io(tags[6]);
						mfc_put((void *) localRowi2, basei + kouter * sizeof(float), 128, tags[6], 0, 0);
						waitfor_matrix_io(tags[7]);
						mfc_put((void *) localRowj2, basej + kouter * sizeof(float), 128, tags[7], 0, 0);
					}
				}
			}
		}
	}
	waitfor_matrix_io(tags[4]);
	waitfor_matrix_io(tags[5]);
	waitfor_matrix_io(tags[6]);
	waitfor_matrix_io(tags[7]);
}

bool process_work_block(unsigned long long matrix, const size_t lb_i, const size_t ub_i, 
	const size_t lb_j, const size_t ub_j, const float delta,
	const size_t Nexpanded, const size_t R)
{
	debug1("Processing work block [%d, %d] x [%d, %d]\n", lb_i, ub_i, lb_j, ub_j);

	bool converged = true;

	// TBD: May want to reuse these rather than create them for each job
	// G is a matrix of dotproducts, P is a matrix of jacobi calculations
	// These are stored between pair of rows from block i and block j
	complex_float_t P_matrix[R][R];

	// Perform the HJ calculation, then the HJ application
	converged &= hestenes_jacobi_calculation(matrix, lb_i, ub_i, lb_j, ub_j, (complex_float_matrix_t) P_matrix, delta,
		Nexpanded, R);
	hestenes_jacobi_application(matrix, lb_i, ub_i, lb_j, ub_j, (complex_float_matrix_t) P_matrix, Nexpanded, R);

	return converged;
}

float calculate_norm2(unsigned long long matrix, const unsigned int row, const unsigned int Nexpanded)
{
	vector float vnorm2;
	size_t kouter, kinner;
	unsigned long long base;
	vector float data;
	vector float localRow1[8] __attribute__((aligned(128)));
	vector float localRow2[8] __attribute__((aligned(128)));

	vnorm2 = (vector float) {0.0, 0.0, 0.0, 0.0};

	base = matrix + (row * Nexpanded * sizeof(float));
	mfc_get((void *) localRow1, base, 128, tags[0], 0, 0);
	for (kouter = 0; kouter < Nexpanded; kouter+= 32)
	{
		// double-buffer
		if (kouter + 32 < Nexpanded)
		{
			mfc_get((void *) localRow2, base + (kouter + 32) * sizeof(float), 128, tags[1], 0, 0);
		}
		waitfor_matrix_io(tags[0]);
		for (kinner = 0; kinner < 8; kinner++)
		{
			data = localRow1[kinner];
			vnorm2 = spu_madd(data, data, vnorm2);
		}
		kouter += 32;
		if (kouter + 32 < Nexpanded)
		{
			mfc_get((void *) localRow1, base + (kouter + 32) * sizeof(float), 128, tags[0], 0, 0);
		}
		if (kouter < Nexpanded)
		{
			waitfor_matrix_io(tags[1]);
			for (kinner = 0; kinner < 8; kinner++)
			{
				data = localRow2[kinner];
				vnorm2 = spu_madd(data, data, vnorm2);
			}
		}
	}
	return spu_extract(vnorm2, 0) + spu_extract(vnorm2, 1) + spu_extract(vnorm2, 2) + spu_extract(vnorm2, 3);
}

int main(unsigned long long speid, addr64 argp, addr64 envp __attribute__((unused))) 
{
	unsigned int i, lb_i, ub_i, lb_j, ub_j, Nexpanded, R, M, NUM_SPE;
	float delta;
	unsigned int mb_value;
	bool finished = false;
	bool converged;
	unsigned long long matrix;

	/* Here is the actual DMA call */
	/* the first parameter is the address in local store to place the data */
	/* the second parameter holds the main memory address                  */
	/* the third parameter holds the number of bytes to DMA                */
	/* the fourth parameter identifies a "tag" to associate with this DMA  */
	/* (this should be a number between 0 and 31, inclusive)               */
	/* the last two parameters are only useful if you've implemented your  */
	/* own cache replacement management policy.  Otherwise set them to 0.  */

	for (i = 0; i < 8; i++)
	{
		tags[i] = mfc_tag_reserve();
		if (tags[i] == MFC_TAG_INVALID)
		{
			printf("SPE with ID %lld was not able to reserve a tag, exiting\n", speid);
			return -1;
		}
	}

	mfc_get((void*)&args, argp.ull, 128, tags[0], 0, 0);
	waitfor_matrix_io(tags[0]);
	matrix = args.initargs.matrix;
	R = args.initargs.R;
	M = args.initargs.M;
	Nexpanded = args.initargs.Nexpanded;
	NUM_SPE = args.initargs.NUM_SPE;
	printf("args: 0x%llx %d %d\n", matrix, R, Nexpanded);fflush(stdout);

	// First order of business - leave a mail for the PPE saying we are ready to process work
	// By default, assume convergence (until we detect otherwise)
	spu_writech(SPU_WrOutMbox, true);

	do
	{
		//printf("SPE(%lld): Waiting for inbound mail\n", speid);
		mb_value = spu_readch(SPU_RdInMbox);
		//printf("SPE(%lld): Received inbound mail %d\n", speid, mb_value);

		if (mb_value == WORK_CHUNK_READY)
		{
			//printf("Fetching a work chunk...\n");

			mfc_get((void*)&args, argp.ull, 128, tags[0], 0, 0);
			waitfor_matrix_io(tags[0]);

			lb_i = args.runargs.lb_i;
			ub_i = args.runargs.ub_i;
			lb_j = args.runargs.lb_j;
			ub_j = args.runargs.ub_j;
			delta = args.runargs.delta;

			/*printf("SPE(%lld): Data received is: [%d, %d] [%d, %d] delta = %g\n", 
				speid,
				(int)lb_i,
				(int)ub_i,
				(int)lb_j,
				(int)ub_j,
				delta);fflush(stdout);
			*/
			converged = process_work_block(matrix, lb_i, ub_i, lb_j, ub_j, delta, Nexpanded, R);
			//converged = true;

			/*printf("SPE(%lld): Finished processing a job, sending %d to outbox\n", 
				speid,
				converged);
			*/
			// send a mail with the value of converged (true or false)
			spu_writech(SPU_WrOutMbox, converged);
		}
		else if (mb_value == CALCULATE_NORMS2)
		{
			unsigned int startat;
			unsigned long long output;
			norm2reply_t runningtotal __attribute__((aligned(128)));
			runningtotal.f = 0.0;
			mfc_get((void*)&args, argp.ull, 128, tags[0], 0, 0);
			waitfor_matrix_io(tags[0]);
			output = args.normargs.output;
			startat = args.normargs.startat;
			for (i = startat; i < M; i += NUM_SPE)
			{
				runningtotal.f += calculate_norm2(matrix, i, Nexpanded);
			}
			mfc_put((void*) &runningtotal, output, 16, tags[0], 0, 0);
			waitfor_matrix_io(tags[0]);
			spu_write_out_mbox(1);
		}
		else if (mb_value == NO_MORE_WORK)
		{
			finished = true;
		}
		else
		{
			printf("SPE(%lld): Got an unidentified mail message: %d\n", speid, mb_value);
		}
	} while(!finished);

	printf("SPE(%lld): Finished\n", speid);

	return 0;
}

